作者:心悦随鑫_196 | 来源:互联网 | 2023-10-12 13:33
原文链接:http://tecdat.cn/?p=6181
原文出处:拓端数据部落公众号
Word Mover的距离(WMD)是用于衡量两个文档之间差异的距离度量,它在文本分析中的应用是由华盛顿大学的一个研究小组在2015年引入的。
Word Mover距离的定义
WMD是两个文档之间的距离,定义为将所有单词从一个文档"移动"到另一个文档所需的最小(加权)累积成本。该距离通过求解以下线性规划问题来计算。
T_{ij} 表示文档 d 中的单词 i 有多少"流量"移动到文档 d' 中的单词 j;
c(i, j) 表示从文档 d 中的单词 i 到文档 d' 中的单词 j "行进"的成本;这里的成本取 word2vec 嵌入空间中两个词向量的欧几里得距离;
如果单词 i 在文档 d 中出现 c_i 次,我们记 d_i = c_i / Σ_j c_j(即词频归一化后的权重)。
WMD是地球移动器距离度量(EMD)的一个特例,这是一个众所周知的运输问题。
如何用SAS计算地球移动的距离?
SAS / OR是解决运输问题的工具。图1显示了一个带有四个节点和节点之间距离的传输示例,我从这个Earth Mover的距离文档中复制了这些节点。目标是找出从{x1 ,x2}到{y1,y2}的最小流量。现在让我们看看如何使用SAS / OR解决这个运输问题。
节点的权重和节点之间的距离如下。
图-1运输问题
/* Source (x) nodes and their weights for the transportation problem */
data x_set;
   input _node_ $ _sd_;
   datalines;
x1 0.74
x2 0.26
;

/* Sink (y) nodes and their weights */
data y_set;
   input _node_ $ _sd_;
   datalines;
y1 0.23
y2 0.51
;

/* Arcs between x and y nodes with their costs (distances) */
data arcdata;
   input _tail_ $ _head_ $ _cost_;
   datalines;
x1 y1 155.7
x1 y2 252.3
x2 y1 292.9
x2 y2 198.2
;

/* Solve the transportation LP: minimize total weighted flow cost,
   normalized by the total sink weight (this objective value is the EMD) */
proc optmodel;
   set <str> xNODES;            /* source node names */
   num w {xNODES};              /* source weights */
   set <str> yNODES;            /* sink node names */
   num u {yNODES};              /* sink weights */
   set <str,str> ARCS;          /* (tail, head) arc pairs */
   num arcCost {ARCS};          /* cost per unit of flow on each arc */

   read data x_set into xNODES=[_node_] w=_sd_;
   read data y_set into yNODES=[_node_] u=_sd_;
   read data arcdata into ARCS=[_tail_ _head_] arcCost=_cost_;

   var flow {<i,j> in ARCS} >= 0;
   impvar sumY = sum {j in yNODES} u[j];
   min obj = (sum {<i,j> in ARCS} arcCost[i,j] * flow[i,j]) / sumY;

   /* each sink must receive exactly its weight */
   con con_y {j in yNODES}: sum {<i,(j)> in ARCS} flow[i,j] = u[j];
   /* each source may ship at most its weight */
   con con_x {i in xNODES}: sum {<(i),j> in ARCS} flow[i,j] <= w[i];

   solve with lp / algorithm=ns scale=none logfreq=1;
   print flow;
quit;
SAS/OR的求解结果如表-1所示,EMD即目标函数值:203.26756757。
表-1 EMD用SAS / OR计算
我用SAS/OR得到的流量数据如表-2所示,与上述Earth Mover's Distance文档中公布的图表一致。
表-2 SAS / OR的流量数据
图-2运输问题流程图
如何用SAS计算Word Mover的距离
论文《From Word Embeddings To Document Distances》通过删除WMD的第二个约束来减少计算量,提出了一个名为宽松词移距离(Relaxed Word Mover's Distance,RWMD)的新度量。由于我们需要读取词嵌入数据,因此我将向您展示如何使用SAS Viya计算两个文档的RWMD。
/* start CAS server */
cas casauto host="host.example.com" port=5570;
libname sascas1 cas;

/* load the two test documents into CAS */
data sascas1.documents;
   infile datalines delimiter='|' missover;
   length text varchar(300);
   input text$ did;
   datalines;
Obama speaks to the media in Illinois.|1
The President greets the press in Chicago.|2
;
run;

/* create stop list */
data sascas1.stopList;
   infile datalines missover;
   length term $20;
   input term$;
   datalines;
the
to
in
;
run;

/* load word embedding model (GloVe 100-dim vectors, tab-delimited) */
proc cas;
   loadtable path='datasources/glove_100d_tab_clean.txt'
      caslib="CASTestTmp"
      importOptions={
         fileType="delimited",
         delimiter='\t',
         getNames=True,
         guessRows=2.0,
         varChars=True
      }
      casOut={name='glove' replace=True};
run;
quit;

/* Compute the Relaxed Word Mover's Distance (RWMD) between two documents.
   Parameters:
     textDS      - CAS table holding the documents
     documentID  - document id variable
     text        - text variable
     language    - parsing language
     stopList    - CAS table of stop words
     word2VectDS - CAS table of word embeddings
     doc1_id / doc2_id - ids of the two documents to compare
   Result: RWMD is placed in macro variable &obj and printed to the log. */
%macro calculateRWMD(
   textDS=documents,
   documentID=did,
   text=text,
   language=English,
   stopList=stopList,
   word2VectDS=glove,
   doc1_id=1,
   doc2_id=2);

/* text parsing and aggregation */
proc cas;
   textParse.tpParse /
      table={name="&textDS", where="&documentID=&doc1_id or &documentID=&doc2_id"}
      docId="&documentID",
      language="&language",
      stemming=False,
      nounGroups=False,
      tagging=False,
      offset={name="outpos", replace=1},
      text="&text";
run;
   textparse.tpAccumulate /
      parent={name="outparent1", replace=1}
      language="&language",
      offset='outpos',
      stopList={name="&stoplist"},
      terms={name="outterms1", replace=1},
      child={name="outchild1", replace=1},
      reduce=1,
      cellweight='none',
      termWeight='none';
run;
quit;

/* terms of the two test documents */
proc cas;
   loadactionset "fedsql";
   execdirect casout={name="doc_terms", replace=true}
      query="select outparent1.*, _term_
             from outparent1
             left join outterms1
             on outparent1._termnum_ = outterms1._termnum_
             where _Document_=&doc1_id or _Document_=&doc2_id;";
run;
quit;

/* term vectors and counts of the two test documents */
proc cas;
   loadactionset "fedsql";
   execdirect casout={name="doc1_termvects", replace=true}
      query="select word2vect.*
             from &word2VectDS word2vect, doc_terms
             where _Document_=&doc2_id and lowcase(term) = _term_;";
run;
   execdirect casout={name="doc1_terms", replace=true}
      query="select doc_terms.*
             from &word2VectDS, doc_terms
             where _Document_=&doc2_id and lowcase(term) = _term_;";
run;
   simple.groupBy /
      table={name="doc1_terms"}
      inputs={"_Term_", "_Count_"}
      aggregator="n"
      casout={name="doc1_termcount", replace=true};
run;
quit;

proc cas;
   loadactionset "fedsql";
   execdirect casout={name="doc2_termvects", replace=true}
      query="select word2vect.*
             from &word2VectDS word2vect, doc_terms
             where _Document_=&doc1_id and lowcase(term) = _term_;";
run;
   execdirect casout={name="doc2_terms", replace=true}
      query="select doc_terms.*
             from &word2VectDS, doc_terms
             where _Document_=&doc1_id and lowcase(term) = _term_;";
run;
   simple.groupBy /
      table={name="doc2_terms"}
      inputs={"_Term_", "_Count_"}
      aggregator="n"
      casout={name="doc2_termcount", replace=true};
run;
quit;

/* calculate Euclidean distance between every word pair */
data doc1_termvects;
   set sascas1.doc1_termvects;
run;
data doc2_termvects;
   set sascas1.doc2_termvects;
run;

proc iml;
   use doc1_termvects;
   read all var _char_ into lterm;
   read all var _num_ into x;
   close doc1_termvects;

   use doc2_termvects;
   read all var _char_ into rterm;
   read all var _num_ into y;
   close doc2_termvects;

   /* pairwise Euclidean distances between rows of x and rows of y */
   d = distance(x, y);

   /* flatten the distance matrix into (tail, head, cost) triples */
   lobs = nrow(lterm);
   robs = nrow(rterm);
   d_out = j(lobs*robs, 3, ' ');
   do i = 1 to lobs;
      do j = 1 to robs;
         d_out[(i-1)*robs+j, 1] = lterm[i];
         d_out[(i-1)*robs+j, 2] = rterm[j];
         d_out[(i-1)*robs+j, 3] = cats(d[i,j]);
      end;
   end;
   create distance from d_out;
   append from d_out;
   close distance;
run;
quit;

/* calculate RWMD between the two documents */
data x_set;
   set sascas1.doc1_termcount;
   rename _term_=_node_;
   _weight_ = _count_;
run;
data y_set;
   set sascas1.doc2_termcount;
   rename _term_=_node_;
   _weight_ = _count_;
run;
data arcdata;
   set distance;
   rename col1=_tail_;
   rename col2=_head_;
   length _cost_ 8;
   _cost_ = col3;
run;

proc optmodel;
   set <str> xNODES;
   num w {xNODES};
   set <str> yNODES;
   num u {yNODES};
   set <str,str> ARCS;
   num arcCost {ARCS};

   read data x_set into xNODES=[_node_] w=_weight_;
   read data y_set into yNODES=[_node_] u=_weight_;
   read data arcdata into ARCS=[_tail_ _head_] arcCost=_cost_;

   var flow {<i,j> in ARCS} >= 0;
   impvar sumY = sum {j in yNODES} u[j];
   min obj = (sum {<i,j> in ARCS} arcCost[i,j] * flow[i,j]) / sumY;

   con con_y {j in yNODES}: sum {<i,(j)> in ARCS} flow[i,j] = u[j];
   /* RWMD relaxation: the source-capacity constraint of WMD is dropped */
   /* con con_x {i in xNODES}: sum {<(i),j> in ARCS} flow[i,j] <= w[i]; */

   solve with lp / algorithm=ns scale=none logfreq=1;
   call symput('obj', strip(put(obj, best.)));
   create data flowData
      from [i j] = {<i,j> in ARCS: flow[i,j].sol > 0}
      col("cost") = arcCost[i,j]
      col("flowweight") = flow[i,j].sol;
run;
quit;

%put RWMD=&obj;
%mend calculateRWMD;

%calculateRWMD(
   textDS=documents,
   documentID=did,
   text=text,
   language=English,
   stopList=stopList,
   word2VectDS=glove,
   doc1_id=1,
   doc2_id=2);

proc print data=flowdata;
run;
quit;
WMD方法不仅可以度量文档之间的相似性,还可以通过可视化流量数据来解释为什么这两个文档是相似的。